Tianchi Industrial Steam Volume Prediction
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns  # plotting
from sklearn.linear_model import LinearRegression, Lasso, Ridge, ElasticNet
from sklearn.neighbors import KNeighborsRegressor
from sklearn.ensemble import GradientBoostingRegressor, RandomForestRegressor, AdaBoostRegressor, ExtraTreesRegressor
# xgboost and lightgbm are third-party packages (pip install xgboost lightgbm)
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor
# support vector machine
from sklearn.svm import SVR
# evaluation metric
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler, StandardScaler, PolynomialFeatures
Data Aggregation
train = pd.read_csv('./zhengqi_train.txt', sep='\t')
test = pd.read_csv('./zhengqi_test.txt', sep='\t')
# Concatenate train and test; before concatenating,
# add an 'origin' column to both frames to mark each row's source
train['origin'] = 'train'
test['origin'] = 'test'
# Merge train and test (test has no 'target' column, so the
# concatenated frame holds NaN targets for the test rows)
data_all = pd.concat([train, test])
print(data_all.shape)
data_all.head()

# 38 features; drop the unimportant ones.
# Plot each feature's distribution: where the train and test
# distributions clearly disagree, the feature will be dropped
plt.figure(figsize=(9, 38*6))
for i, col in enumerate(data_all.columns[:-2]):
    cond = data_all['origin'] == 'train'
    train_col = data_all[col][cond]  # training data
    cond = data_all['origin'] == 'test'
    test_col = data_all[col][cond]   # test data
    axes = plt.subplot(38, 1, i+1)
    ax = sns.kdeplot(train_col, shade=True)
    sns.kdeplot(test_col, shade=True, ax=ax)
    plt.legend(['train', 'test'])
    plt.xlabel(col)

plt.figure(figsize=(9, 6))
for col in data_all.columns[:-2]:
    g = sns.FacetGrid(data_all, col='origin')
    g.map(sns.distplot, col)  # distribution plot, one panel per origin
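The drop decision just below is made by eye from the KDE plots. As a rough cross-check that is not part of the original workflow, a two-sample Kolmogorov-Smirnov test can quantify how far apart each feature's train and test distributions are; the 0.05 cutoff is only an illustrative choice.

# Sketch (not in the original notebook): quantify the train/test
# distribution shift per feature with a two-sample KS test
from scipy.stats import ks_2samp

is_train = data_all['origin'] == 'train'
for col in data_all.columns[:-2]:
    stat, p = ks_2samp(data_all.loc[is_train, col], data_all.loc[~is_train, col])
    if p < 0.05:  # illustrative cutoff: the two distributions differ significantly
        print('%s: KS=%.3f, p=%.3g' % (col, stat, p))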


drop_labels = ['V11', 'V17', 'V22', 'V5']
data_all.drop(drop_labels, axis=1, inplace=True)
data_all.shape
Correlation Coefficients
# covariance matrix
cov = data_all.cov()
cov.head()

# correlation coefficient matrix
corr = data_all.corr()
corr.head()

# Use the correlation coefficients to find 7 features that are only
# weakly correlated with the target (|corr| < 0.1)
cond = corr.loc['target'].abs() < 0.1
drop_labels = corr.loc['target'].index[cond]
# Index(['V14', 'V21', 'V25', 'V26', 'V32', 'V33', 'V34'], dtype='object')
drop_labels
# After inspecting the feature distributions, only drop the badly distributed ones
drop_labels = ['V14', 'V21']
data_all.drop(drop_labels, axis=1, inplace=True)
data_all.shape

# visualize the strength of the pairwise correlations
plt.figure(figsize=(20, 16))  # set the figure width and height
mcorr = train.corr()  # correlation matrix: the coefficient between every pair of variables
mask = np.zeros_like(mcorr, dtype=bool)  # boolean matrix with the same shape as mcorr
mask[np.triu_indices_from(mask)] = True  # True above the diagonal
# colors
cmap = sns.diverging_palette(220, 10, as_cmap=True)  # returns a matplotlib colormap
g = sns.heatmap(mcorr, mask=mask, cmap=cmap, square=True, annot=True, fmt='0.2f')  # heatmap of pairwise correlations
plt.show()

Standardization
data_all[data_all['origin'] == 'test'].describe()

data_all[data_all['origin'] == 'train'].describe()

stand = StandardScaler()
data = data_all.iloc[:, :-2]  # features only (exclude target and origin)
data2 = stand.fit_transform(data)
data2

cols = data_all.columns
data_all_std = pd.DataFrame(data2,columns=cols[:-2])
data_all_std

# reset to a 0..4812 RangeIndex (2888 train + 1925 test rows) so the
# index-based merge below lines up with data_all_std
data_all.index = np.arange(4813)
data_all

# merge the target and origin columns back onto the standardized features
data_all_std = pd.merge(data_all_std, data_all.iloc[:, -2:], right_index=True, left_index=True)
data_all_std.head()

data_all_std.describe()

Training and Testing with Different Algorithms
# outlier detection
from sklearn.linear_model import RidgeCV
data_all_std.head()

ridge = RidgeCV(alphas=[0.0001, 0.001, 0.01, 0.1, 0.2, 0.5, 1, 2, 3, 4, 5, 10, 20, 30, 50])
cond = data_all_std['origin'] == 'train'
X_train = data_all_std[cond].iloc[:, :-2]
# ground-truth targets
y_train = data_all_std[cond]['target']
# the model can never fit the data and targets 100%
ridge.fit(X_train, y_train)
# predictions always deviate somewhat from the truth; treat especially
# large deviations (more than 0.8 * std here) as outliers
y_ = ridge.predict(X_train)
cond = abs(y_train - y_) > y_train.std() * 0.8
cond.sum()

# plot the outliers
plt.figure(figsize=(12, 6))
axes = plt.subplot(1, 3, 1)
axes.scatter(y_train, y_)
axes.scatter(y_train[cond], y_[cond], c='red', s=20)
axes = plt.subplot(1, 3, 2)
axes.scatter(y_train, y_train - y_)
axes.scatter(y_train[cond], (y_train - y_)[cond], c='red')
axes = plt.subplot(1, 3, 3)
# _ = axes.hist(y_train, bins=50)
(y_train - y_).plot.hist(bins=50, ax=axes)
(y_train - y_).loc[cond].plot.hist(bins=50, ax=axes, color='r')

data_all_std

# filter out the outlier rows
drop_index = cond[cond].index
print(data_all_std.shape)
data_all_std.drop(drop_index, axis=0, inplace=True)
data_all_std.shape
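The same residual-based rule is applied again later on the normalized data, with a 1.0-std threshold instead of 0.8. As a sketch (the helper name find_outliers is hypothetical, not from the original), the two passes could share one function:

# Hypothetical helper capturing the residual-based outlier rule used twice
# in this notebook (sigma=0.8 here, sigma=1.0 in the normalized pass)
def find_outliers(model, X, y, sigma=0.8):
    model.fit(X, y)
    residual = y - model.predict(X)  # training-set prediction errors
    return residual.abs() > y.std() * sigma  # boolean outlier mask

# usage: cond = find_outliers(ridge, X_train, y_train, sigma=0.8)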

def detect_model(estimators, data):
    # data is the (X_train, X_test, y_train, y_test) tuple
    # returned by train_test_split below
    for key, estimator in estimators.items():
        estimator.fit(data[0], data[2])
        y_ = estimator.predict(data[1])
        mse = mean_squared_error(data[3], y_)
        print('-------------------mse%s' % (key), mse)
        r2 = estimator.score(data[1], data[3])
        print('+++++++++++++++++++r2_score%s' % (key), r2)
        print('\n')
cond = data_all_std['origin'] == 'train'
X = data_all_std[cond].iloc[:,:-2]
y = data_all_std[cond]['target']
data = train_test_split(X,y,test_size = 0.2)
estimators = {}
estimators['knn'] = KNeighborsRegressor()
estimators['linear'] = LinearRegression()
estimators['ridge'] = Ridge()
estimators['lasso'] = Lasso()
estimators['elasticnet'] = ElasticNet()
estimators['forest'] = RandomForestRegressor()
estimators['gbdt'] = GradientBoostingRegressor()
estimators['ada'] = AdaBoostRegressor()
estimators['extreme'] = ExtraTreesRegressor()
estimators['svm_rbf'] = SVR(kernel='rbf')
estimators['svm_poly'] = SVR(kernel='poly')
estimators['light'] = LGBMRegressor()
estimators['xgb'] = XGBRegressor()
# On our validation data: KNN, Lasso, ElasticNet and SVM_poly underperform (dropped below)
detect_model(estimators,data)

# filter out the methods that performed poorly: the linear models
estimators = {}
# estimators['linear'] = LinearRegression()
# estimators['ridge'] = Ridge()
# estimators['lasso'] = Lasso()
estimators['forest'] = RandomForestRegressor()
estimators['gbdt'] = GradientBoostingRegressor()
estimators['ada'] = AdaBoostRegressor()
estimators['extreme'] = ExtraTreesRegressor()
estimators['svm_rbf'] = SVR(kernel='rbf')
estimators['light'] = LGBMRegressor()
estimators['xgb'] = XGBRegressor()
cond = data_all_std['origin'] == 'train'
X_train = data_all_std[cond].iloc[:,:-2]
y_train = data_all_std[cond]['target']
cond = data_all_std['origin'] == 'test'
X_test = data_all_std[cond].iloc[:,:-2]
# each model predicts independently; average the predictions
y_pred = []
for key, model in estimators.items():
    model.fit(X_train, y_train)
    y_ = model.predict(X_test)
    y_pred.append(y_)
y_ = np.mean(y_pred, axis=0)
pd.Series(y_).to_csv('./ensemble2.txt', index=False)
# Use the predictions as new features and let the models learn again,
# looking for the relation between the data and the target.
# y_ (the prediction) deviates from the truth; append each model's
# prediction as a new feature column and retrain
for key, model in estimators.items():
    model.fit(X_train, y_train)
    y_ = model.predict(X_train)
    X_train[key] = y_
    y_ = model.predict(X_test)
    X_test[key] = y_
# each model predicts independently; average the predictions
y_pred = []
for key, model in estimators.items():
    model.fit(X_train, y_train)
    y_ = model.predict(X_test)
    y_pred.append(y_)
y_ = np.mean(y_pred, axis=0)
pd.Series(y_).to_csv('./ensemble3.txt', index=False)
sns.distplot(y_)
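One caveat about the feature-building loop above: each model is fit on X_train and then predicts that same X_train, so the stacked columns partly memorize the training targets. A common leakage-free variant of stacking, shown here only as a sketch of what the loop could look like (cv=5 is an illustrative choice), builds the training-side columns from out-of-fold predictions:

# Sketch: out-of-fold stacked features with sklearn's cross_val_predict.
# Assumes it replaces the feature-building loop above, i.e. it runs before
# any prediction columns have been appended to X_train / X_test
from sklearn.model_selection import cross_val_predict

base_cols = list(X_train.columns)  # snapshot of the original feature columns
for key, model in estimators.items():
    # out-of-fold predictions never see the rows they predict
    X_train[key] = cross_val_predict(model, X_train[base_cols], y_train, cv=5)
    model.fit(X_train[base_cols], y_train)
    X_test[key] = model.predict(X_test[base_cols])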

y_.mean()

y_.std()

# add some noise to y_ (1925 = number of test rows), presumably to restore
# some of the spread that averaging the models removed
y_ += np.random.randn(1925) * 0.1
pd.Series(y_).to_csv('./ensemble4.txt', index=False)
Normalizing the Data
data_all.head()

data = data_all.iloc[:,:-2]
minmaxscaler = MinMaxScaler()
data3 = minmaxscaler.fit_transform(data)
data3

# the normalized data
data_all_norm = pd.DataFrame(data3,columns = data_all.columns[:-2])
data_all_norm

# merge the target and origin columns back in
data_all_norm = pd.merge(data_all_norm, data_all.iloc[:, -2:], left_index=True, right_index=True)
data_all_norm.describe()

def scale_minmax(data):
    return (data - data.min()) / (data.max() - data.min())

# plot the continuous variables
from scipy import stats
fcols = 6
frows = len(data_all_norm.columns[:10])
plt.figure(figsize=(4*fcols, 4*frows))
i = 0
for col in data_all_norm.columns[:10]:
    dat = data_all_norm[[col, 'target']].dropna()
    # panel 1: distribution of the raw data
    i += 1
    plt.subplot(frows, fcols, i)
    sns.distplot(dat[col], fit=stats.norm)  # compare against a fitted normal
    plt.title(col + ' Original')
    # panel 2: skew, a statistic that measures asymmetry,
    # i.e. how far the data is from a normal distribution
    i += 1
    plt.subplot(frows, fcols, i)
    _ = stats.probplot(dat[col], plot=plt)  # probability plot against the normal
    plt.title('skew=' + '{:.4f}'.format(stats.skew(dat[col])))
    plt.xlabel('')
    plt.ylabel('')
    # panel 3: scatter plot against the target
    i += 1
    plt.subplot(frows, fcols, i)
    # plt.plot(dat[col], dat['target'], '.', alpha=0.5)
    plt.scatter(dat[col], dat['target'], alpha=0.5)
    plt.title('corr=' + '{:.2f}'.format(np.corrcoef(dat[col], dat['target'])[0][1]))
    # !!! the same three panels after transforming the data !!!
    # panel 4: distribution after a Box-Cox transform
    i += 1
    plt.subplot(frows, fcols, i)
    trans_var, lambda_var = stats.boxcox(dat[col].dropna() + 1)
    trans_var = scale_minmax(trans_var)
    sns.distplot(trans_var, fit=stats.norm)
    plt.title(col + ' Transformed')
    plt.xlabel('')
    # panel 5: skew after the transform
    i += 1
    plt.subplot(frows, fcols, i)
    _ = stats.probplot(trans_var, plot=plt)
    plt.title('skew=' + '{:.4f}'.format(stats.skew(trans_var)))
    plt.xlabel('')
    plt.ylabel('')
    # panel 6: scatter plot after the transform
    i += 1
    plt.subplot(frows, fcols, i)
    plt.plot(trans_var, dat['target'], '.', alpha=0.5)
    plt.title('corr=' + '{:.2f}'.format(np.corrcoef(trans_var, dat['target'])[0][1]))
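For reference, the skew value shown in panels 2 and 5 is the Fisher-Pearson sample skewness that stats.skew computes (0 for a symmetric distribution, positive for a long right tail):

$$ g_1 = \frac{\tfrac{1}{n}\sum_{i=1}^{n}(x_i-\bar{x})^3}{\bigl(\tfrac{1}{n}\sum_{i=1}^{n}(x_i-\bar{x})^2\bigr)^{3/2}} $$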

# Apply a Box-Cox transform to every feature column (for loop over all columns).
# It is a common transform in statistical modeling that makes the data
# more normal and better standardized
for col in data_all_norm.columns[:-2]:
    # +1 because the minimum of data_all_norm is 0 and Box-Cox needs positive
    # input; stats.boxcox returns two values, so both are received
    boxcox, maxlog = stats.boxcox(data_all_norm[col] + 1)
    data_all_norm[col] = scale_minmax(boxcox)  # rescale back to [0, 1]
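For reference, the one-parameter Box-Cox transform applied by stats.boxcox is

$$ x^{(\lambda)} = \begin{cases} \dfrac{x^{\lambda}-1}{\lambda}, & \lambda \neq 0 \\ \ln x, & \lambda = 0 \end{cases} $$

with lambda (the maxlog value above) chosen by maximum likelihood so that the transformed data is as close to normal as possible. Box-Cox requires strictly positive input, which is exactly why the loop shifts the min-max scaled data by +1.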
Filtering Outliers
ridge = RidgeCV(alphas=[0.0001, 0.001, 0.01, 0.1, 0.2, 0.5, 1, 2, 3, 4, 5, 10, 20, 30, 50])
cond = data_all_norm['origin'] == 'train'
X_train = data_all_norm[cond].iloc[:, :-2]
# ground-truth targets
y_train = data_all_norm[cond]['target']
# the model can never fit the data and targets 100%
ridge.fit(X_train, y_train)
# predictions always deviate somewhat from the truth; treat especially
# large deviations (more than 1.0 * std this time) as outliers
y_ = ridge.predict(X_train)
cond = abs(y_ - y_train) > y_train.std()
print(cond.sum())
# plot the outliers
plt.figure(figsize=(12, 6))
axes = plt.subplot(1, 3, 1)
axes.scatter(y_train, y_)
axes.scatter(y_train[cond], y_[cond], c='red', s=20)
axes = plt.subplot(1, 3, 2)
axes.scatter(y_train, y_train - y_)
axes.scatter(y_train[cond], (y_train - y_)[cond], c='red')
axes = plt.subplot(1, 3, 3)
# _ = axes.hist(y_train, bins=50)
(y_train - y_).plot.hist(bins=50, ax=axes)
(y_train - y_).loc[cond].plot.hist(bins=50, ax=axes, color='r')

# filter out the outliers
index = cond[cond].index
data_all_norm.drop(index, axis=0, inplace=True)
cond = data_all_norm['origin'] == 'train'
X_train = data_all_norm[cond].iloc[:, :-2]
y_train = data_all_norm[cond]['target']
cond = data_all_norm['origin'] == 'test'
X_test = data_all_norm[cond].iloc[:, :-2]
# filter out the methods that performed poorly: the linear models
estimators = {}
# estimators['linear'] = LinearRegression()
# estimators['ridge'] = Ridge()
# estimators['lasso'] = Lasso()
estimators['forest'] = RandomForestRegressor(n_estimators=300)
estimators['gbdt'] = GradientBoostingRegressor(n_estimators=300)
estimators['ada'] = AdaBoostRegressor(n_estimators=300)
estimators['extreme'] = ExtraTreesRegressor(n_estimators=300)
estimators['svm_rbf'] = SVR(kernel='rbf')
estimators['light'] = LGBMRegressor(n_estimators=300)
estimators['xgb'] = XGBRegressor(n_estimators=300)
result = []
for key, model in estimators.items():
    model.fit(X_train, y_train)
    y_ = model.predict(X_test)
    result.append(y_)
y_ = np.mean(result, axis=0)
pd.Series(y_).to_csv('./norm.txt', index=False)
