Machine Learning in Practice: Tianchi Steam Volume Prediction (Complete Code)
Competition Background
Thermal power generation burns fuel to heat water into steam; the steam's pressure drives a turbine, which in turn drives a generator to produce electricity. Across this chain of energy conversions, the key factor for generation efficiency is the boiler's combustion efficiency, i.e., how much steam is produced by heating the water.
Many factors influence boiler combustion efficiency, including bed temperature, bed pressure, furnace temperature, pressure, and so on.
The goal of this competition: given data collected from boiler sensors (38 feature variables), predict the steam volume with a trained model.
Since the target is a continuous numeric variable and the data come with labels, this is a classic regression problem.
Algorithms typically used for regression include linear regression, ridge regression, LASSO regression, decision tree regression, and gradient boosted tree regression.
1. Imports and Data Mining
1.1 Imports
import warnings
warnings.filterwarnings("ignore")
import matplotlib.pyplot as plt
plt.rcParams.update({'figure.max_open_warning': 0})
import seaborn as sns
import pandas as pd
import numpy as np
from scipy import stats
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV, RepeatedKFold, cross_val_score,cross_val_predict,KFold
from sklearn.metrics import make_scorer,mean_squared_error
from sklearn.linear_model import LinearRegression, Lasso, Ridge, ElasticNet
from sklearn.svm import LinearSVR, SVR
from sklearn.neighbors import KNeighborsRegressor
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor,AdaBoostRegressor
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor
from sklearn.preprocessing import PolynomialFeatures,MinMaxScaler,StandardScaler
1.2 Loading the Data
# read the data
with open("./zhengqi_train.txt") as fr:
    data_train = pd.read_table(fr, sep="\t")
with open("./zhengqi_test.txt") as fr_test:
    data_test = pd.read_table(fr_test, sep="\t")
1.3 Merging the Data
# merge the two datasets, adding a column to mark the source
data_train["origin"] = "train"
data_test["origin"] = "test"
# concatenate the two files row-wise (axis=0), with "origin" as the marker
data_all = pd.concat([data_train, data_test], axis=0, ignore_index=True)
Dropping mismatched features
# drop features whose train and test distributions differ markedly
data_all.drop(["V5","V9","V11","V17","V22","V28"], axis=1, inplace=True)
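The usual justification for dropping exactly these six features is that their distributions in the training and test sets do not match, so they would mislead the model. A minimal sketch of how to eyeball that for one of them (V5 is chosen for illustration, and it must run before the drop above; the plotting details are my own, not the author's original check):

# Illustrative check (run before the drop): compare the train vs. test
# distribution of one of the dropped features; a clear mismatch justifies removal
plt.figure()
sns.kdeplot(data_train["V5"], color="b", shade=True, label="train")
sns.kdeplot(data_test["V5"], color="r", shade=True, label="test")
plt.xlabel("V5")
plt.legend()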
Min-max normalization
# min-max normalize each column: (x - min) / (max - min)
cols_numeric = list(data_all.columns)  # all column names as a list
cols_numeric.remove("origin")  # exclude the marker column
# normalization helper
def scale_minmax(col):
    return (col - col.min()) / (col.max() - col.min())
# apply it to every column except target
scale_cols = [col for col in cols_numeric if col != 'target']
data_all[scale_cols] = data_all[scale_cols].apply(scale_minmax, axis=0)
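Incidentally, sklearn's MinMaxScaler (imported above but otherwise unused) performs the same per-column [0,1] scaling; a minimal equivalent sketch, if you prefer the library route to the hand-rolled scale_minmax:

# Equivalent scaling with sklearn's MinMaxScaler (alternative to scale_minmax)
scaler = MinMaxScaler()
data_all[scale_cols] = scaler.fit_transform(data_all[scale_cols])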
Plotting: exploring the features against the target
# plot the distribution of every feature
fcols = 6  # six diagnostic plots per feature
frows = len(cols_numeric) - 1  # one row of plots per feature
plt.figure(figsize = (4*fcols, 4*frows))
i = 0
# loop over every feature and draw its diagnostics
for var in cols_numeric:
    if var != 'target':
        dat = data_all[[var, 'target']].dropna()  # drop missing values
        # six plots per feature
        i += 1
        # plot 1: distribution of the raw data
        plt.subplot(frows, fcols, i)
        sns.distplot(dat[var], fit=stats.norm)
        plt.title(var + ' Original')
        plt.xlabel('')
        i += 1
        # plot 2: Q-Q plot
        plt.subplot(frows, fcols, i)
        _ = stats.probplot(dat[var], plot=plt)
        plt.title('skew=' + '{:.4f}'.format(stats.skew(dat[var])))
        plt.xlabel('')
        plt.ylabel('')
        i += 1
        # plot 3: scatter plot against the target
        plt.subplot(frows, fcols, i)
        plt.plot(dat[var], dat['target'], '.', alpha=0.5)
        plt.title('corr=' + '{:.2f}'.format(np.corrcoef(dat[var], dat['target'])[0][1]))
        i += 1
        # plot 4: distribution after a Box-Cox transform, which pushes the data
        # toward normality so a linear model better satisfies its linearity,
        # normality, independence, and homoscedasticity assumptions
        # without losing information
        plt.subplot(frows, fcols, i)
        trans_var, lambda_var = stats.boxcox(dat[var].dropna() + 1)
        trans_bar = scale_minmax(trans_var)  # re-normalize
        sns.distplot(trans_bar, fit=stats.norm)
        plt.title(var + ' Transformed')
        plt.xlabel('')
        i += 1
        # plot 5: Q-Q plot after the transform
        plt.subplot(frows, fcols, i)
        _ = stats.probplot(trans_var, plot=plt)
        plt.title('skew = ' + '{:.4f}'.format(stats.skew(trans_var)))
        plt.xlabel('')
        plt.ylabel('')
        i += 1
        # plot 6: scatter plot against the target after the transform
        plt.subplot(frows, fcols, i)
        plt.plot(trans_var, dat['target'], '.', alpha=0.5)
        plt.title('corr = ' + '{:.2f}'.format(np.corrcoef(trans_var, dat['target'])[0][1]))
Applying a Box-Cox transform to the features to improve normality
The Box-Cox transform, proposed by Box and Cox in 1964, is a generalized power transform commonly used in statistical modeling when a continuous response variable does not follow a normal distribution. After the transform, the unobservable error and its correlation with the predictors can be reduced to some extent. Its distinguishing feature is a single parameter that is estimated from the data itself and then determines the exact form of the transform. Box-Cox can noticeably improve the normality, symmetry, and homogeneity of variance of the data, and works well on many real-world datasets.
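For reference, the one-parameter Box-Cox transform of a positive variable $y$ is

$$y^{(\lambda)} = \begin{cases} \dfrac{y^{\lambda}-1}{\lambda}, & \lambda \neq 0 \\ \ln y, & \lambda = 0 \end{cases}$$

scipy.stats.boxcox estimates $\lambda$ by maximum likelihood; since the input must be strictly positive, the code below adds 1 to the min-max-scaled features first.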
cols_transform = data_all.columns[0:-2]  # every feature column (excludes target and origin)
for col in cols_transform:
    # transform the column in place
    data_all.loc[:,col], _ = stats.boxcox(data_all.loc[:,col] + 1)
Target statistics after the transform, with quantile plots against the normal distribution
print(data_all.target.describe())
plt.figure(figsize=(12,4))
plt.subplot(1,2,1)
sns.distplot(data_all.target.dropna() , fit=stats.norm);
plt.subplot(1,2,2)
_=stats.probplot(data_all.target.dropna(), plot=plt)
Exponentially transforming the target to make it closer to normal, with plots
# transform the target with 1.5**y to improve normality
sp = data_train.target
data_train['target1'] = np.power(1.5, sp)
print(data_train.target1.describe())
plt.figure(figsize=(12,4))
plt.subplot(1,2,1)
sns.distplot(data_train.target1.dropna(), fit=stats.norm)
plt.subplot(1,2,2)
_ = stats.probplot(data_train.target1.dropna(), plot=plt)
Getting the training and test data
# function to get training samples
def get_training_data():
    # extract the training rows
    df_train = data_all[data_all["origin"] == "train"].copy()
    df_train["label"] = data_train.target1
    # split target and features
    y = df_train.target
    X = df_train.drop(["origin","target","label"], axis=1)
    X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size=0.3, random_state=100)
    return X_train, X_valid, y_train, y_valid

# extract test data (which has no target)
def get_test_data():
    df_test = data_all[data_all["origin"] == "test"].reset_index(drop=True)
    return df_test.drop(["origin","target"], axis=1)
Scoring functions
# metrics for evaluation
def rmse(y_true, y_pred):
    diff = y_pred - y_true
    sum_sq = sum(diff**2)
    n = len(y_pred)
    return np.sqrt(sum_sq/n)

def mse(y_true, y_pred):
    return mean_squared_error(y_true, y_pred)

# scorers to be used in sklearn model fitting
rmse_scorer = make_scorer(rmse, greater_is_better=False)
mse_scorer = make_scorer(mse, greater_is_better=False)
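Because greater_is_better=False, sklearn negates the metric internally so that larger is still better. A minimal sanity-check sketch, assuming the split returned by get_training_data() above and a plain Ridge as the example model:

# Cross-validate a Ridge with the custom scorer; scores come back negated
X_train, X_valid, y_train, y_valid = get_training_data()
scores = cross_val_score(Ridge(), X_train, y_train, scoring=rmse_scorer, cv=5)
print("negated RMSE per fold:", scores)
print("mean RMSE:", -scores.mean())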
Finding outliers, with plots
def find_outliers(model, X, y, sigma=3):
    # predict y values using the model
    try:
        y_pred = pd.Series(model.predict(X), index=y.index)
    # if predicting fails, try fitting the model first
    except Exception:
        model.fit(X, y)
        y_pred = pd.Series(model.predict(X), index=y.index)
    # calculate residuals between the model prediction and true y values
    resid = y - y_pred
    mean_resid = resid.mean()
    std_resid = resid.std()
    # calculate z statistic, define outliers to be where |z|>sigma
    z = (resid - mean_resid)/std_resid
    outliers = z[abs(z) > sigma].index
    print('R2=', model.score(X,y))
    print('rmse=', rmse(y,y_pred))
    print('mse=', mean_squared_error(y,y_pred))
    print('-------------------------------------')
    print('mean of residuals:', mean_resid)
    print('std of residuals:', std_resid)
    print('-------------------------------------')
    print(len(outliers), 'outliers')
    print(outliers.tolist())
    plt.figure(figsize=(15,5))
    ax_131 = plt.subplot(1,3,1)
    plt.plot(y, y_pred, '.')
    plt.plot(y.loc[outliers], y_pred.loc[outliers], 'ro')
    plt.legend(['Accepted','Outlier'])
    plt.xlabel('y')
    plt.ylabel('y_pred')
    ax_132 = plt.subplot(1,3,2)
    plt.plot(y, y - y_pred, '.')
    plt.plot(y.loc[outliers], y.loc[outliers] - y_pred.loc[outliers], 'ro')
    plt.legend(['Accepted','Outlier'])
    plt.xlabel('y')
    plt.ylabel('y - y_pred')
    ax_133 = plt.subplot(1,3,3)
    z.plot.hist(bins=50, ax=ax_133)
    z.loc[outliers].plot.hist(color='r', bins=50, ax=ax_133)
    plt.legend(['Accepted','Outlier'])
    plt.xlabel('z')
    plt.savefig('outliers.png')
    return outliers
Displaying the outliers
# get the training data
X_train, X_valid, y_train, y_valid = get_training_data()
test = get_test_data()
# find outliers using a Ridge model
outliers = find_outliers(Ridge(), X_train, y_train)
# permanently remove these outliers from the data
#df_train = data_all[data_all["origin"]=="train"]
#df_train["label"]=data_train.target1
#df_train=df_train.drop(outliers)
X_outliers = X_train.loc[outliers]
y_outliers = y_train.loc[outliers]
X_t = X_train.drop(outliers)
y_t = y_train.drop(outliers)
Training on the data with the outliers removed
# now that the outliers are identified, drop them and train on what remains
def get_training_data_omitoutliers():
    y1 = y_t.copy()
    X1 = X_t.copy()
    return X1, y1
Training models with grid search
# when training, we tune hyperparameters with a grid search
# inputs: the model itself, the parameter grid, training data,
# the number of folds, and the number of repeats
def train_model(model, param_grid=[], X=[], y=[],
                splits=5, repeats=5):
    # if no training set is passed in, fetch the outlier-free one
    if len(y) == 0:
        X, y = get_training_data_omitoutliers()
    # repeated K-fold cross-validation
    rkfold = RepeatedKFold(n_splits=splits, n_repeats=repeats)
    # if a parameter grid is given, run the grid search
    if len(param_grid) > 0:
        # set up the grid search
        gsearch = GridSearchCV(model, param_grid, cv=rkfold,
                               scoring="neg_mean_squared_error",
                               verbose=1, return_train_score=True)
        # fit the grid of models
        gsearch.fit(X, y)
        # keep the best model and the index of its parameter setting
        model = gsearch.best_estimator_
        best_idx = gsearch.best_index_
        # cross-validation scores for the best parameter setting
        grid_results = pd.DataFrame(gsearch.cv_results_)
        cv_mean = abs(grid_results.loc[best_idx,'mean_test_score'])
        cv_std = grid_results.loc[best_idx,'std_test_score']
    # no parameter grid: skip the search and just cross-validate the model
    else:
        grid_results = []
        cv_results = cross_val_score(model, X, y, scoring="neg_mean_squared_error", cv=rkfold)
        cv_mean = abs(np.mean(cv_results))
        cv_std = np.std(cv_results)
        model.fit(X, y)  # fit on the full training set before predicting below
    # fitting is done; collect the scores and plot the diagnostics
    # put the CV mean and std into a Series
    cv_score = pd.Series({'mean':cv_mean, 'std':cv_std})
    # predict y with the fitted model
    y_pred = model.predict(X)
    # print the headline numbers: training score, RMSE, MSE, CV mean and std
    print('----------------------------------')
    print(model)
    print('----------------------------------')
    print('score = ', model.score(X,y))
    print('rmse = ', rmse(y,y_pred))
    print('mse = ', mse(y,y_pred))
    print('cross_val: mean = ', cv_mean, ', std = ', cv_std)
    # numbers alone aren't enough: plot the residual diagnostics too
    y_pred = pd.Series(y_pred, index=y.index)
    resid = y - y_pred              # residuals: true minus predicted
    mean_resid = resid.mean()
    std_resid = resid.std()
    z = (resid - mean_resid)/std_resid
    n_outliers = sum(abs(z) > 3)
    plt.figure(figsize=(15,5))
    ax_131 = plt.subplot(1,3,1)
    plt.plot(y, y_pred, '.')
    plt.xlabel('y')
    plt.ylabel('y_pred')
    plt.title('corr = {:.3f}'.format(np.corrcoef(y,y_pred)[0][1]))
    ax_132 = plt.subplot(1,3,2)
    plt.plot(y, y - y_pred, '.')
    plt.xlabel('y')
    plt.ylabel('y - y_pred')
    plt.title('std resid = {:.3f}'.format(std_resid))
    ax_133 = plt.subplot(1,3,3)
    z.plot.hist(bins=50, ax=ax_133)
    plt.xlabel('z')
    plt.title('{:.0f} samples with z>3'.format(n_outliers))
    return model, cv_score, grid_results
# places to store optimal models and scores
opt_models = dict()
score_models = pd.DataFrame(columns=['mean','std'])
# no. k-fold splits
splits=5
# no. k-fold iterations
repeats=5
Time to run the regressions, folks!
1. Ridge regression (L2 regularization)
2. Lasso regression (L1 regularization)
3. ElasticNet regression
4. SVR
5. KNN
6. GBDT
7. XGBoost
8. Random forest (RF)
9. Multi-model bagging
And a multi-model fusion to finish. Enough to make your hair fall out, folks! (The penalized objectives behind the first three models are sketched right after this list.)
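For reference, with weights $w$, design matrix $X$, and $n$ samples, the penalized least-squares objectives are, in sklearn's parameterization (where $\alpha$ is each model's alpha and $\rho$ is ElasticNet's l1_ratio):

$$\text{Ridge:}\quad \min_w \; \lVert y - Xw \rVert_2^2 + \alpha \lVert w \rVert_2^2$$

$$\text{Lasso:}\quad \min_w \; \tfrac{1}{2n}\lVert y - Xw \rVert_2^2 + \alpha \lVert w \rVert_1$$

$$\text{ElasticNet:}\quad \min_w \; \tfrac{1}{2n}\lVert y - Xw \rVert_2^2 + \alpha\rho \lVert w \rVert_1 + \tfrac{\alpha(1-\rho)}{2}\lVert w \rVert_2^2$$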
1. Ridge regression (L2 regularization). Baidu it if you need a refresher!
# model 1: ridge regression (L2 regularization)
model = 'Ridge'
opt_models[model] = Ridge()
alph_range = np.arange(0.25, 6, 0.25)  # alphas from 0.25 to 6, step 0.25
param_grid = {'alpha': alph_range}
# train the model
opt_models[model], cv_score, grid_results = train_model(opt_models[model], param_grid = param_grid,
splits = splits, repeats = repeats)
cv_score.name = model
score_models = score_models.append(cv_score)
plt.figure()
plt.errorbar(alph_range,abs(grid_results['mean_test_score']),
abs(grid_results['std_test_score'])/np.sqrt(splits*repeats))
plt.xlabel('alpha')
plt.ylabel('score')
2. Lasso regression (L1 regularization). With the first one done, the rest is mostly a matter of changing the model name and copy-pasting!
model = 'Lasso'
opt_models[model] = Lasso()
alph_range = np.arange(1e-4,1e-3,4e-5)
param_grid = {'alpha': alph_range}
opt_models[model],cv_score,grid_results = train_model(opt_models[model], param_grid=param_grid,
splits=splits, repeats=repeats)
cv_score.name = model
score_models = score_models.append(cv_score)
plt.figure()
plt.errorbar(alph_range, abs(grid_results['mean_test_score']),
abs(grid_results['std_test_score'])/np.sqrt(splits*repeats))
plt.xlabel('alpha')
plt.ylabel('score')
3. ElasticNet regression. Its cost function combines the regularizers of Lasso and ridge regression, with two parameters λ and ρ (alpha and l1_ratio in sklearn; see the objectives above) controlling the size of the penalty.
model = 'ElasticNet'
opt_models[model] = ElasticNet()
param_grid = {'alpha': np.arange(1e-4,1e-3,4e-5),
'l1_ratio':np.arange(0.1,1.0,0.1),
'max_iter':[100000]}
opt_models[model], cv_score, grid_results = train_model(opt_models[model], param_grid=param_grid,
splits=splits, repeats=1)
cv_score.name = model
score_models = score_models.append(cv_score)
# plt.figure()
# plt.errorbar(alph_range, abs(grid_results['mean_test_score']),abs(grid_results['std_test_score'])/np.sqrt(splits*repeats))
# plt.xlabel('alpha')
# plt.ylabel('score')
4. SVR (support vector regression)
model = 'LinearSVR'
opt_models[model] = LinearSVR()
crange = np.arange(0.1,1.0,0.1)
param_grid = {'C': crange,
'max_iter':[10000]}
opt_models[model], cv_score, grid_results = train_model(opt_models[model], param_grid=param_grid,
splits=splits, repeats=repeats)
cv_score.name = model
score_models = score_models.append(cv_score)
plt.figure()
plt.errorbar(crange, abs(grid_results['mean_test_score']),abs(grid_results['std_test_score'])/np.sqrt(splits*repeats))
plt.xlabel('C')
plt.ylabel('score')
5. KNN
model = 'KNeighbors'
opt_models[model] = KNeighborsRegressor()
param_grid = {'n_neighbors':np.arange(3,11,1)}
opt_models[model], cv_score, grid_results = train_model(opt_models[model], param_grid=param_grid,
splits=splits, repeats=1)
cv_score.name = model
score_models = score_models.append(cv_score)
plt.figure()
plt.errorbar(np.arange(3,11,1), abs(grid_results['mean_test_score']),abs(grid_results['std_test_score'])/np.sqrt(splits*1))
plt.xlabel('n_neighbors')
plt.ylabel('score')
6. GBDT
model = 'GradientBoosting'
opt_models[model] = GradientBoostingRegressor()
param_grid = {'n_estimators':[150,250,350],
'max_depth':[1,2,3],
'min_samples_split':[5,6,7]}
opt_models[model], cv_score, grid_results = train_model(opt_models[model], param_grid=param_grid,
splits=splits, repeats=1)
cv_score.name = model
score_models = score_models.append(cv_score)
7. XGBoost
model = 'XGB'
opt_models[model] = XGBRegressor()
param_grid = {'n_estimators':[100,200,300,400,500],
'max_depth':[1,2,3],
}
opt_models[model], cv_score,grid_results = train_model(opt_models[model], param_grid=param_grid,
splits=splits, repeats=1)
cv_score.name = model
score_models = score_models.append(cv_score)
8. Random forest (RF)
model = 'RandomForest'
opt_models[model] = RandomForestRegressor()
param_grid = {'n_estimators':[100,150,200],
'max_features':[8,12,16,20,24],
'min_samples_split':[2,4,6]}
opt_models[model], cv_score, grid_results = train_model(opt_models[model], param_grid=param_grid,
splits=5, repeats=1)
cv_score.name = model
score_models = score_models.append(cv_score)
All eight algorithms above are done! If you need another one, wire it up yourself, e.g. LightGBM:
model = 'lightGBM'
opt_models[model] = LGBMRegressor()
param_grid = {'n_estimators':[100,200,300,400,500],
'max_depth':[1,2,3],
}
opt_models[model], cv_score,grid_results = train_model(opt_models[model], param_grid=param_grid,
splits=splits, repeats=1)
cv_score.name = model
score_models = score_models.append(cv_score)
Now that all nine algorithms have been run, let every model predict and output the average!
This idea needs one function, model_predict.
It takes the test data and, optionally, the test y values to inspect the fit; the stack argument is left for later.
def model_predict(test_data, test_y=[], stack=False):
    # count how many models actually contribute
    i = 0
    # pre-allocate y_predict_total with one slot per test row, all zeros,
    # so the per-model predictions can be accumulated into it
    # (pre-defining an output array like this is a handy habit for any project)
    y_predict_total = np.zeros((test_data.shape[0],))
    # loop over every trained model
    for model in opt_models.keys():
        # skip LinearSVR and KNeighbors, whose scores were too weak to include
        if model != "LinearSVR" and model != "KNeighbors":
            y_predict = opt_models[model].predict(test_data)
            y_predict_total += y_predict
            i += 1
            # print each model's MSE when labels are available
            if len(test_y) > 0:
                print("{}_mse:".format(model), mean_squared_error(y_predict, test_y))
    y_predict_mean = np.round(y_predict_total/i, 3)
    # and the MSE of the averaged prediction
    if len(test_y) > 0:
        print("mean_mse:", mean_squared_error(y_predict_mean, test_y))
    else:
        y_predict_mean = pd.Series(y_predict_mean)
    return y_predict_mean
First, score the predictions on the validation split:
model_predict(X_valid,y_valid)
No problems at all: the mean MSE comes out around 0.12, so predict on the test set and write the submission, folks!
y_ = model_predict(test)
y_.to_csv('./predict_.txt', header=False, index=False)
