CatBoost道路风险预测学习笔记（对应的ipynb的markdown版本）

发表于 2025-11-03 ｜ 更新于 2025-11-04 ｜ 分类于 学习 ｜ 阅读次数： ｜ 本文字数：2.6k ｜ 阅读时长 ≈ 9 分钟

这篇笔记基于Kaggle竞赛数据，详细记录了使用CatBoost进行道路风险预测的完整流程，包括第三方库导入与版本查看、数据集导入与清洗、数据可视化分析（目标分布和相关性热力图）、CatBoost模型构建（设置1000次迭代、0.1学习率、6层深度等参数并启用GPU加速）、模型评估（通过RMSE和5折交叉验证）、特征重要性分析（速度限制、光照条件和曲率是最重要的三个特征）、预测结果可视化（实际值vs预测值散点图）以及提交结果生成（创建包含id和accident_risk的submission.csv文件）。

第三方库导入和数据集的导入，数据集的查看

第三方库的导入和输出对应的版本编号

import pandas as pd
import os
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import mean_squared_error
from catboost import CatBoostRegressor
from IPython.core.display import HTML



def show_object_columns_info(df):
    """Print a summary of every object-dtype column in ``df``.

    The index of object-dtype column names is printed first. For each of
    those columns three lines follow: the column name, the count returned
    by ``Series.nunique()``, and the array returned by ``Series.unique()``.

    Args:
        df: DataFrame to inspect; it is not modified.

    Returns:
        None. All output goes to stdout.
    """
    text_columns = df.select_dtypes(include=['object']).columns
    print(text_columns)
    for name in text_columns:
        series = df[name]
        print(f"列名: {name}")
        print(f'object数量: {series.nunique()}')
        print(series.unique())

# Report the versions of seaborn, pandas and numpy (in that order) so the
# environment behind the recorded outputs can be reproduced.
for lib in (sns, pd, np):
    print(lib.__version__)

0.13.2
2.3.3
1.26.4

数据的导入，数据集的描述和清洗

# Locations of the competition CSV files, given as relative paths.
train_path = "../../datasets/competition_datas/Accident_Risk/train.csv"
test_path = "../../datasets/competition_datas/Accident_Risk/test.csv"

# Load the training split.
train_ds = pd.read_csv(train_path)  # ds: dataset
# Optional inspection calls, left disabled:
#train_ds.info()
#print(train_ds.shape)
#train_ds.describe()
train_ds.head()
# Use the helper to show details of the object-dtype columns:
#show_object_columns_info(train_ds)

	id	road_type	num_lanes	curvature	speed_limit	lighting	weather	road_signs_present	public_road	time_of_day	holiday	school_season	num_reported_accidents	accident_risk
0	0	urban	2	0.06	35	daylight	rainy	False	True	afternoon	False	True	1	0.13
1	1	urban	4	0.99	35	daylight	clear	True	False	evening	True	True	0	0.35
2	2	rural	4	0.63	70	dim	clear	False	True	morning	True	False	2	0.30
3	3	highway	4	0.07	35	dim	rainy	True	True	morning	False	False	1	0.21
4	4	rural	1	0.58	60	daylight	foggy	False	False	evening	True	False	1	0.56

1
2
3

# Load the test split and preview its first rows.
test_ds = pd.read_csv(test_path)
#print(test_ds.info())
test_ds.head()

	id	road_type	num_lanes	curvature	speed_limit	lighting	weather	road_signs_present	public_road	time_of_day	holiday	school_season	num_reported_accidents
0	517754	highway	2	0.34	45	night	clear	True	True	afternoon	True	True	1
1	517755	urban	3	0.04	45	dim	foggy	True	False	afternoon	True	False	0
2	517756	urban	2	0.59	35	dim	clear	True	False	afternoon	True	True	1
3	517757	rural	4	0.95	35	daylight	rainy	False	False	afternoon	False	False	2
4	517758	highway	2	0.86	35	daylight	clear	True	False	evening	False	True	3

数据的可视化(了解数据，并不是选择特征)

由于数据集中没有缺失值（见下方 X.info() 的输出，各列均为 517754 non-null），因此无需处理缺失值，直接使用可视化工具进行可视化分析。

# Feature matrix: every column except the target and the row identifier.
X = train_ds.drop(columns=['accident_risk', 'id'])
# Target vector: the accident_risk column.
y = train_ds['accident_risk']

X.info()
display(X.head())
# Numeric (int64 / float64) subset of the features, used for the
# correlation heatmap.
X_num = X.select_dtypes(include=['int64', 'float64'])

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 517754 entries, 0 to 517753
Data columns (total 12 columns):
 #   Column                  Non-Null Count   Dtype  
---  ------                  --------------   -----  
 0   road_type               517754 non-null  object 
 1   num_lanes               517754 non-null  int64  
 2   curvature               517754 non-null  float64
 3   speed_limit             517754 non-null  int64  
 4   lighting                517754 non-null  object 
 5   weather                 517754 non-null  object 
 6   road_signs_present      517754 non-null  bool   
 7   public_road             517754 non-null  bool   
 8   time_of_day             517754 non-null  object 
 9   holiday                 517754 non-null  bool   
 10  school_season           517754 non-null  bool   
 11  num_reported_accidents  517754 non-null  int64  
dtypes: bool(4), float64(1), int64(3), object(4)
memory usage: 33.6+ MB

	road_type	num_lanes	curvature	speed_limit	lighting	weather	road_signs_present	public_road	time_of_day	holiday	school_season	num_reported_accidents
0	urban	2	0.06	35	daylight	rainy	False	True	afternoon	False	True	1
1	urban	4	0.99	35	daylight	clear	True	False	evening	True	True	0
2	rural	4	0.63	70	dim	clear	False	True	morning	True	False	2
3	highway	4	0.07	35	dim	rainy	True	True	morning	False	False	1
4	rural	1	0.58	60	daylight	foggy	False	False	evening	True	False	1

目标分布

# Histogram (50 bins) of the target with a KDE curve overlaid.
plt.figure()
ax = sns.histplot(y, bins=50, kde=True)
ax.set_title('Distribution of Accident Risk')
ax.set_xlabel('Accident Risk')
plt.show()

png

# Pairwise correlation of the numeric features, drawn as an annotated
# heatmap with two-decimal cell labels.
X_corr = X_num.corr()
plt.figure()
ax = sns.heatmap(X_corr, annot=True, fmt='.2f', cmap='viridis', center=0)
ax.set_title('Correlation Heatmap')
plt.show()

png

catboost

# Cast the boolean flag columns to strings so that they become object-dtype
# and are picked up as categorical features below; numeric columns are left
# untouched.
X_bool_cols = X.select_dtypes(include=['bool']).columns
print(X_bool_cols)

# Apply the same cast to the training features and to the test set.
X[X_bool_cols] = X[X_bool_cols].astype(str)
test_ds[X_bool_cols] = test_ds[X_bool_cols].astype(str)

# Categorical features: every object-dtype column, i.e. the original string
# columns plus the booleans converted above.
cat_features = X.select_dtypes(include=['object']).columns.tolist()
print(cat_features)

Index(['road_signs_present', 'public_road', 'holiday', 'school_season'], dtype='object')
['road_type', 'lighting', 'weather', 'road_signs_present', 'public_road', 'time_of_day', 'holiday', 'school_season']

# Split the data: hold out 20% of the rows for validation (shuffled, seed 42).
# NOTE(review): stratify=y treats every distinct accident_risk value as its
# own stratum; scikit-learn raises if any value occurs only once, so this
# relies on the target being coarsely rounded -- confirm, or stratify on a
# binned version of y instead.
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42, shuffle=True, stratify=y)

print(X_train.shape, X_val.shape, y_train.shape, y_val.shape)

(414203, 12) (103551, 12) (414203,) (103551,)

# Hyper-parameters passed to CatBoostRegressor.
params = {
    'iterations': 1000,  # maximum number of boosting iterations
    'learning_rate': 0.1,  # learning rate
    'depth': 6,  # tree depth
    'cat_features': cat_features,  # names of the categorical columns
    'random_seed': 42,  # random seed
    'loss_function': 'RMSE',  # loss function
    'verbose': 100,  # log progress every 100 iterations
    'early_stopping_rounds': 50,  # early-stopping patience
    "task_type": "GPU",  # train on the GPU
    "devices": "0",  # GPU device id
}

Catboost_model = CatBoostRegressor(**params)
# Fit on the training split only. Fitting on the full X would put the
# validation rows into the training data, so the eval_set metric (and the
# early stopping driven by it) would no longer measure generalization.
# eval_set is what makes the "test"/"best" columns appear in the training log.
Catboost_model.fit(X_train, y_train, eval_set=(X_val, y_val))

0:	learn: 0.1523865	test: 0.1523846	best: 0.1523846 (0)	total: 94.3ms	remaining: 1m 34s
100:	learn: 0.0564065	test: 0.0564811	best: 0.0564811 (100)	total: 8.16s	remaining: 1m 12s
200:	learn: 0.0561983	test: 0.0563075	best: 0.0563075 (200)	total: 17s	remaining: 1m 7s
300:	learn: 0.0560877	test: 0.0562139	best: 0.0562139 (300)	total: 26s	remaining: 1m
400:	learn: 0.0560011	test: 0.0561416	best: 0.0561416 (400)	total: 34.5s	remaining: 51.5s
500:	learn: 0.0559323	test: 0.0560808	best: 0.0560808 (500)	total: 43.1s	remaining: 42.9s
600:	learn: 0.0558682	test: 0.0560278	best: 0.0560278 (600)	total: 52.6s	remaining: 34.9s
700:	learn: 0.0558153	test: 0.0559829	best: 0.0559829 (700)	total: 1m 1s	remaining: 26.2s
800:	learn: 0.0557594	test: 0.0559401	best: 0.0559401 (800)	total: 1m 10s	remaining: 17.4s
900:	learn: 0.0557073	test: 0.0559001	best: 0.0559001 (899)	total: 1m 19s	remaining: 8.72s
999:	learn: 0.0556659	test: 0.0558659	best: 0.0558659 (999)	total: 1m 29s	remaining: 0us
bestTest = 0.05586587195
bestIteration = 999





<catboost.core.CatBoostRegressor at 0x257b5932e50>

# Predict on the held-out validation split.
y_pred = Catboost_model.predict(X_val)
# mean_squared_error returns the MSE; take the square root so the printed
# number really is an RMSE, on the same scale as the CatBoost training log
# and the cross-validation score below.
val_rmse = np.sqrt(mean_squared_error(y_val, y_pred))
print(f"Validation RMSE: {val_rmse:.4f}")

# 5-fold cross-validation. The scorer reports the negated MSE of each fold,
# so negate it and take the square root to obtain a per-fold RMSE.
cv_scores = cross_val_score(Catboost_model, X, y, cv=5, scoring='neg_mean_squared_error')
cv_rmse = np.sqrt(-cv_scores)
print(f"Cross-validation RMSE: {cv_rmse.mean():.4f} (+/- {cv_rmse.std() * 2:.4f})")

Validation RMSE: 0.0031
0:	learn: 0.1523602	total: 74.6ms	remaining: 1m 14s
100:	learn: 0.0563163	total: 7.29s	remaining: 1m 4s
200:	learn: 0.0560969	total: 14.1s	remaining: 56.1s
300:	learn: 0.0559734	total: 20.9s	remaining: 48.6s
400:	learn: 0.0558743	total: 29.3s	remaining: 43.8s
500:	learn: 0.0557934	total: 36.8s	remaining: 36.7s
600:	learn: 0.0557296	total: 45.4s	remaining: 30.1s
700:	learn: 0.0556612	total: 53s	remaining: 22.6s
800:	learn: 0.0556026	total: 1m	remaining: 15s
900:	learn: 0.0555548	total: 1m 10s	remaining: 7.79s
999:	learn: 0.0555088	total: 1m 32s	remaining: 0us
0:	learn: 0.1524152	total: 78.6ms	remaining: 1m 18s
100:	learn: 0.0564793	total: 10.3s	remaining: 1m 31s
200:	learn: 0.0562581	total: 21.2s	remaining: 1m 24s
300:	learn: 0.0561289	total: 33.6s	remaining: 1m 17s
400:	learn: 0.0560335	total: 44s	remaining: 1m 5s
500:	learn: 0.0559565	total: 57.6s	remaining: 57.3s
600:	learn: 0.0558867	total: 1m 13s	remaining: 48.6s
700:	learn: 0.0558220	total: 1m 26s	remaining: 36.9s
800:	learn: 0.0557681	total: 1m 38s	remaining: 24.5s
900:	learn: 0.0557146	total: 1m 50s	remaining: 12.2s
999:	learn: 0.0556690	total: 2m 4s	remaining: 0us
0:	learn: 0.1523094	total: 71.2ms	remaining: 1m 11s
100:	learn: 0.0563526	total: 11.3s	remaining: 1m 40s
200:	learn: 0.0561467	total: 23.4s	remaining: 1m 33s
300:	learn: 0.0560148	total: 35.4s	remaining: 1m 22s
400:	learn: 0.0559182	total: 49.6s	remaining: 1m 14s
500:	learn: 0.0558331	total: 1m 2s	remaining: 1m 2s
600:	learn: 0.0557582	total: 1m 16s	remaining: 50.5s
700:	learn: 0.0556941	total: 1m 28s	remaining: 37.7s
800:	learn: 0.0556372	total: 1m 39s	remaining: 24.8s
900:	learn: 0.0555797	total: 1m 50s	remaining: 12.2s
999:	learn: 0.0555314	total: 2m 1s	remaining: 0us
0:	learn: 0.1524123	total: 80ms	remaining: 1m 19s
100:	learn: 0.0563271	total: 8.31s	remaining: 1m 13s
200:	learn: 0.0561321	total: 19.8s	remaining: 1m 18s
300:	learn: 0.0560138	total: 35.6s	remaining: 1m 22s
400:	learn: 0.0559252	total: 48.3s	remaining: 1m 12s
500:	learn: 0.0558532	total: 59.8s	remaining: 59.6s
600:	learn: 0.0557891	total: 1m 11s	remaining: 47.2s
700:	learn: 0.0557296	total: 1m 23s	remaining: 35.5s
800:	learn: 0.0556794	total: 1m 35s	remaining: 23.7s
900:	learn: 0.0556325	total: 1m 45s	remaining: 11.6s
999:	learn: 0.0555885	total: 1m 55s	remaining: 0us
0:	learn: 0.1523530	total: 80.1ms	remaining: 1m 19s
100:	learn: 0.0564252	total: 10.3s	remaining: 1m 31s
200:	learn: 0.0562220	total: 22.4s	remaining: 1m 29s
300:	learn: 0.0561025	total: 31.7s	remaining: 1m 13s
400:	learn: 0.0559980	total: 41.2s	remaining: 1m 1s
500:	learn: 0.0559200	total: 51.8s	remaining: 51.6s
600:	learn: 0.0558540	total: 1m 1s	remaining: 40.9s
700:	learn: 0.0557899	total: 1m 14s	remaining: 31.9s
800:	learn: 0.0557331	total: 1m 25s	remaining: 21.2s
900:	learn: 0.0556776	total: 1m 39s	remaining: 11s
999:	learn: 0.0556246	total: 1m 52s	remaining: 0us
Cross-validation RMSE: 0.0561 (+/- 0.0005)

# Feature importance of the fitted model, paired with the feature names.
feature_importance = Catboost_model.get_feature_importance()  # array of scores, one per feature
#print(feature_importance)
feature_names = X.columns  # pandas Index of column names
#print(feature_names)
importance_df = pd.DataFrame({
    'feature': feature_names,
    'importance': feature_importance
}).sort_values('importance', ascending=False)  # sort by importance, descending

print("\nTop 10 Most Important Features:")
print(importance_df.head(10))

[2.68670072e-02 5.57111260e-02 1.53970212e+01 3.71319477e+01
 3.42504222e+01 9.16493355e+00 2.00611428e-02 4.68708926e-02
 2.95851141e-02 5.71107703e-02 1.50494424e-02 3.80441977e+00]
Index(['road_type', 'num_lanes', 'curvature', 'speed_limit', 'lighting',
       'weather', 'road_signs_present', 'public_road', 'time_of_day',
       'holiday', 'school_season', 'num_reported_accidents'],
      dtype='object')

Top 10 Most Important Features:
                   feature  importance
3              speed_limit   37.131948
4                 lighting   34.250422
2                curvature   15.397021
5                  weather    9.164934
11  num_reported_accidents    3.804420
9                  holiday    0.057111
1                num_lanes    0.055711
7              public_road    0.046871
8              time_of_day    0.029585
0                road_type    0.026867

结果可视化

C:\Windows\Temp\ipykernel_27588\3889204982.py:3: FutureWarning: 

Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `y` variable to `hue` and set `legend=False` for the same effect.

  sns.barplot(data=importance_df.head(10), x='importance', y='feature', palette='viridis')

png

# Scatter of actual vs. predicted risk on the validation split. The dashed
# red diagonal spans the range of the actual values and marks the points
# where predicted == actual.
actual_range = [y_val.min(), y_val.max()]
plt.scatter(y_val, y_pred, s=5, marker='.', alpha=0.1)
plt.plot(actual_range, actual_range, 'r--', lw=2)
plt.title('Actual vs Predicted Values')
plt.xlabel('Actual Values')
plt.ylabel('Predicted Values')
plt.show()

png

提交结果

# Features for inference: the test frame without its id column.
test_features = test_ds.drop(columns='id')

# Predict the accident risk for every test row.
test_predictions = Catboost_model.predict(test_features)

# Assemble the submission: the original ids next to the predictions.
submission = test_ds[['id']].assign(accident_risk=test_predictions)

# Write the file without the DataFrame index.
submission.to_csv('submission.csv', index=False)

print(f"Submission shape: {submission.shape}")
print("\nFirst 5 rows of submission:")
display(submission.head())

Submission shape: (172585, 2)

First 5 rows of submission:

	id	accident_risk
0	517754	0.291503
1	517755	0.122281
2	517756	0.187375
3	517757	0.313193
4	517758	0.398013